Molecular design with Pretrained SAFE
# Notebook setup: enable the IPython `autoreload` extension, then import the
# libraries used throughout this tutorial.
# NOTE(review): per the IPython docs, `%autoreload 2` reloads imported modules
# before each cell is executed — handy when editing `safe` locally.
%load_ext autoreload
%autoreload 2
import os
# This variable is set *before* `safe` is imported.
# NOTE(review): presumably it disables parallelism in the Hugging Face
# `tokenizers` backend to avoid its fork-related warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import safe as sf
import datamol as dm
# designer = sf.SAFEDesign.load_default(verbose=True)
--------------------------------------------------------------------------- HFValidationError Traceback (most recent call last) /home/hadim/Code/valence/Libs/safe/docs/tutorials/design-with-safe.ipynb Cell 3 line 1 ----> <a href='vscode-notebook-cell:/home/hadim/Code/valence/Libs/safe/docs/tutorials/design-with-safe.ipynb#W4sZmlsZQ%3D%3D?line=0'>1</a> designer = sf.SAFEDesign.load_default(verbose=True) File ~/Code/valence/Libs/safe/safe/sample.py:89, in SAFEDesign.load_default(cls, verbose, model_dir, device) 87 if model_dir is None or not model_dir: 88 model_dir = cls._DEFAULT_MODEL_PATH ---> 89 model = SAFEDoubleHeadsModel.from_pretrained(model_dir) 90 tokenizer = SAFETokenizer.load(os.path.join(model_dir, "tokenizer.json")) 91 gen_config = GenerationConfig.from_pretrained(model_dir) File ~/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/modeling_utils.py:2507, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs) 2504 if commit_hash is None: 2505 if not isinstance(config, PretrainedConfig): 2506 # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible -> 2507 resolved_config_file = cached_file( 2508 pretrained_model_name_or_path, 2509 CONFIG_NAME, 2510 cache_dir=cache_dir, 2511 force_download=force_download, 2512 resume_download=resume_download, 2513 proxies=proxies, 2514 local_files_only=local_files_only, 2515 token=token, 2516 revision=revision, 2517 subfolder=subfolder, 2518 _raise_exceptions_for_missing_entries=False, 2519 _raise_exceptions_for_connection_errors=False, 2520 ) 2521 commit_hash = extract_commit_hash(resolved_config_file, commit_hash) 2522 else: File ~/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/utils/hub.py:429, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, 
proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs) 426 user_agent = http_user_agent(user_agent) 427 try: 428 # Load from URL or cache if already cached --> 429 resolved_file = hf_hub_download( 430 path_or_repo_id, 431 filename, 432 subfolder=None if len(subfolder) == 0 else subfolder, 433 repo_type=repo_type, 434 revision=revision, 435 cache_dir=cache_dir, 436 user_agent=user_agent, 437 force_download=force_download, 438 proxies=proxies, 439 resume_download=resume_download, 440 token=token, 441 local_files_only=local_files_only, 442 ) 443 except GatedRepoError as e: 444 raise EnvironmentError( 445 "You are trying to access a gated repo.\nMake sure to request access at " 446 f"https://huggingface.co/{path_or_repo_id} and pass a token having permission to this repo either " 447 "by logging in with `huggingface-cli login` or by passing `token=<your_token>`." 448 ) from e File ~/local/micromamba/envs/safe/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py:110, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs) 105 for arg_name, arg_value in chain( 106 zip(signature.parameters, args), # Args values 107 kwargs.items(), # Kwargs values 108 ): 109 if arg_name in ["repo_id", "from_id", "to_id"]: --> 110 validate_repo_id(arg_value) 112 elif arg_name == "token" and arg_value is not None: 113 has_token = True File ~/local/micromamba/envs/safe/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py:158, in validate_repo_id(repo_id) 155 raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.") 157 if repo_id.count("/") > 1: --> 158 raise HFValidationError( 159 "Repo id must be in the form 'repo_name' or 'namespace/repo_name':" 160 f" '{repo_id}'. Use `repo_type` argument if needed." 
161 ) 163 if not REPO_ID_REGEX.match(repo_id): 164 raise HFValidationError( 165 "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are" 166 " forbidden, '-' and '.' cannot start or end the name, max length is 96:" 167 f" '{repo_id}'." 168 ) HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/hadim/.cache/safe/default_model'. Use `repo_type` argument if needed.
# candidate_mol = "O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1"
# scaffold = "[*]N-c1ccc2ncnc(-N[*])c2c1" # this is for scaffold decoration
# superstructure = "c1ccc2ncncc2c1"
# side_chains = '[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1' # this is for scaffold morphing
# motif = "[*]-N1CCCCC1" # this is for motif extension
# linker_generation = ["[*]-N1CCCCC1", "Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1"] # this is for linker generation
We start with the molecule shown in the image in the README of the SAFE GitHub repository and use it to assess performance on the various tasks SAFE should, in theory, be good at.
# dm.to_image(dm.to_mol(candidate_mol))
# N_SAMPLES = 100
De novo generation
Generate novel molecules without any constraints.
# generated = designer.de_novo_generation(sanitize=True, n_samples_per_trial=N_SAMPLES)
2023-08-29 16:58:21.338 | INFO | safe.sample:de_novo_generation:559 - After sanitization, 83 / 100 (83.00 %) generated molecules are valid !
# dm.to_image(generated[:20])
Scaffold Decoration
For scaffold decoration, we wish to generate new molecules that contain a given scaffold as their core. Usually, the attachment points on the scaffold dictate where the new vectors will be added.
# dm.to_image(scaffold)
# generated = designer.scaffold_decoration(scaffold=scaffold, n_samples_per_trial=N_SAMPLES, n_trials=2, sanitize=True, do_not_fragment_further=True)
2023-08-29 16:59:49.566 | INFO | safe.sample:scaffold_decoration:534 - After sanitization, 182 / 200 (91.00 %) generated molecules are valid !
# dm.viz.lasso_highlight_image([dm.to_mol(x) for x in generated[:20]], dm.from_smarts(scaffold))
Super structure generation
In super structure generation, we simply want to generate superstructures of a given molecular subgraph.
# dm.to_image(superstructure)
# generated = designer.super_structure(core=superstructure, n_samples_per_trial=N_SAMPLES, n_trials=1, sanitize=True, do_not_fragment_further=False, attachment_point_depth=3)
# #generated
2023-08-29 16:59:56.491 | INFO | safe.sample:super_structure:488 - After sanitization, 41 / 100 (41.00 %) generated molecules are valid !
# dm.to_image(generated[:20])
Motif Extension
In motif extension, we are interested in generating molecules that contain a given motif as a starting point.
# dm.to_image(motif)
# # let's make some long sequence
# generated = designer.motif_extension(motif=motif, n_samples_per_trial=N_SAMPLES, n_trials=1, sanitize=True, do_not_fragment_further=False, min_length=25, max_length=80)
2023-08-29 17:01:55.965 | INFO | safe.sample:scaffold_decoration:534 - After sanitization, 98 / 100 (98.00 %) generated molecules are valid !
# dm.to_image(generated[:20])
Scaffold Morphing
In scaffold morphing, we wish to replace the scaffold of a molecule with another one. As input, the user must provide either the side chains, or the input molecule together with its core.
# dm.to_image(side_chains)
# generated = designer.scaffold_morphing(side_chains=side_chains, n_samples_per_trial=N_SAMPLES, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100)
# dm.to_image(generated[:20])
2023-08-29 17:00:14.942 | INFO | safe.sample:_fragment_linking:389 - After sanitization, 100 / 100 (100.00 %) generated molecules are valid !
Linker generation
Linker generation is mostly the same as scaffold morphing; here the input is the pair of fragments to be linked.
# dm.to_image(linker_generation)
# generated = designer.linker_generation(*linker_generation, n_samples_per_trial=N_SAMPLES, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100)
# dm.to_image(generated[:20])
2023-08-29 17:00:27.044 | INFO | safe.sample:_fragment_linking:389 - After sanitization, 100 / 100 (100.00 %) generated molecules are valid !